# Notebook-wide imports and display configuration for the Online Retail
# cohort-analysis / RFM-segmentation workflow.
import numpy as np
import pandas as pd
import datetime as dt
import panel as pn
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
# Route DataFrame.plot() calls through hvplot so plots are interactive.
pd.options.plotting.backend = "hvplot"
from IPython.core.interactiveshell import InteractiveShell
# Echo every expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# init_notebook_mode(connected=True)
# cf.go_offline()
from panel.template import DefaultTheme
pn.extension()
# Load the raw Online Retail transactions from a local Excel export.
# NOTE(review): hard-coded absolute user path — adjust for other machines.
path = "/Users/glebsokolov/Downloads/Online Retail.xlsx"
df = pd.read_excel(path)
# Quick sanity checks: first rows, dtypes and null counts.
df.head(5)
df.info()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
<class 'pandas.core.frame.DataFrame'> RangeIndex: 541909 entries, 0 to 541908 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 541909 non-null object 1 StockCode 541909 non-null object 2 Description 540455 non-null object 3 Quantity 541909 non-null int64 4 InvoiceDate 541909 non-null datetime64[ns] 5 UnitPrice 541909 non-null float64 6 CustomerID 406829 non-null float64 7 Country 541909 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(4) memory usage: 33.1+ MB
To speed up loading this dataset in the future, convert it to the Feather format.
# Feather cannot store mixed-type object columns, so force these columns to
# str before writing the faster-loading Feather copy of the dataset.
df[['StockCode', 'InvoiceNo', 'Description']] = df[['StockCode', 'InvoiceNo', 'Description']].astype(str)
df.to_feather('onlineRetail.feather')
df = pd.read_feather('onlineRetail.feather')
df.head(10).hvplot.table()
df.isnull().sum()
# Rows without a CustomerID cannot be attributed to a customer cohort — drop
# them, then confirm no nulls remain anywhere.
df = df.dropna(subset=["CustomerID"])
df.isnull().sum().sum()
InvoiceNo 0 StockCode 0 Description 0 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 135080 Country 0 dtype: int64
0
# Remove exact duplicate transaction rows and verify none are left.
df.duplicated().sum()
df = df.drop_duplicates()
df.duplicated().sum()
5225
0
df.describe().hvplot.table()
Let's remove invalid rows (non-positive quantities and unit prices).
# Drop rows that carry no revenue information for RFM: returns/cancellations
# (Quantity <= 0) and zero- or negative-priced items.
keep = (df["Quantity"] > 0) & (df["UnitPrice"] > 0)
df = df.loc[keep]
df.describe().hvplot.table()
df.shape
(530104, 8)
For cohort analysis, there are a few labels that we have to create:
def get_month(x) : return dt.datetime(x.year,x.month,1)
# Label each transaction with its invoice month and the customer's cohort
# month (the month of the customer's first purchase).
df['InvoiceMonth'] = df['InvoiceDate'].apply(get_month)
grouping = df.groupby('CustomerID')['InvoiceMonth']
# transform('min') broadcasts each customer's earliest invoice month back
# onto every one of that customer's rows.
df['CohortMonth'] = grouping.transform('min')
df.tail().hvplot.table(width=1000)
def get_month_int(dframe, column):
    """Split a datetime column into its year, month and day components.

    Returns a (year, month, day) tuple of integer Series taken from the
    column's ``.dt`` accessor.
    """
    parts = dframe[column].dt
    return parts.year, parts.month, parts.day
invoice_year,invoice_month,_ = get_month_int(df,'InvoiceMonth')
cohort_year,cohort_month,_ = get_month_int(df,'CohortMonth')
# CohortIndex = months elapsed since the cohort month, 1-based: a purchase
# made in the cohort month itself gets index 1.
year_diff = invoice_year - cohort_year
month_diff = invoice_month - cohort_month
df['CohortIndex'] = year_diff * 12 + month_diff + 1
#Count monthly active customers from each cohort
grouping = df.groupby(['CohortMonth', 'CohortIndex'])
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique)
# Return number of unique elements in the object.
cohort_data = cohort_data.reset_index()
# Pivot to a cohort matrix: one row per cohort month, one column per
# months-since-acquisition index, values = distinct active customers.
cohort_counts = cohort_data.pivot(index='CohortMonth',columns='CohortIndex',values='CustomerID')
cohort_counts.hvplot.table()
# Retention table
# The first column (CohortIndex 1) holds each cohort's initial size.
cohort_size = cohort_counts.iloc[:,0]
retention = cohort_counts.divide(cohort_size,axis=0) #axis=0 to ensure the divide along the row axis
(retention.round(3) * 100).hvplot.table() #to show the number as percentage
retention.plot(kind='heatmap', title='Retention Rates', width=600)
#Build the heatmap
plt.figure(figsize=(15, 8))
plt.title('Retention rates')
# vmax=0.5 clips the colour scale so differences between the (small)
# later-month retention rates remain visible.
sns.heatmap(data=retention,annot = True,fmt = '.0%',vmin = 0.0,vmax = 0.5,cmap="BuPu_r")
plt.show()
<Figure size 1080x576 with 0 Axes>
Text(0.5, 1.0, 'Retention rates')
<AxesSubplot:title={'center':'Retention rates'}, xlabel='CohortIndex', ylabel='CohortMonth'>
#Average quantity for each cohort
grouping = df.groupby(['CohortMonth', 'CohortIndex'])
cohort_data = grouping['Quantity'].mean()
cohort_data = cohort_data.reset_index()
# Same cohort-matrix layout as the retention table, but with mean quantity.
average_quantity = cohort_data.pivot(index='CohortMonth',columns='CohortIndex',values='Quantity')
average_quantity.round(1).hvplot.table()
# Use plain dates as the index so heatmap y-axis labels stay compact.
average_quantity.index = average_quantity.index.date
#Build the heatmap of mean purchase quantity per cohort and month index.
plt.figure(figsize=(15, 8))
plt.title('Average quantity for each cohort')
sns.heatmap(data=average_quantity,annot = True,vmin = 0.0,vmax =20,cmap="BuGn_r")
plt.show()
<Figure size 1080x576 with 0 Axes>
Text(0.5, 1.0, 'Average quantity for each cohort')
<AxesSubplot:title={'center':'Average quantity for each cohort'}, xlabel='CohortIndex'>
#New Total Sum Column: revenue per invoice line (unit price x quantity).
df['TotalSum'] = df['UnitPrice']* df['Quantity']
#Data preparation steps: confirm the date span covered by the data.
print('Min Invoice Date:',df.InvoiceDate.dt.date.min(),'max Invoice Date:',
df.InvoiceDate.dt.date.max())
df.head(3).hvplot.table()
Min Invoice Date: 2010-12-01 max Invoice Date: 2011-12-09
# Snapshot = one day after the latest invoice; Recency is measured as the
# number of days between a customer's last purchase and this date.
snapshot_date = df['InvoiceDate'].max() + dt.timedelta(days=1)
snapshot_date
#The last day of purchase in total is 09 DEC, 2011. To calculate the day periods,
#let's set one day after the last one,or
#10 DEC as a snapshot_date. We will count the diff days with snapshot_date.
Timestamp('2011-12-10 12:50:00')
# Calculate RFM metrics per customer:
#   Recency   = days since last purchase (relative to snapshot_date)
#   Frequency = number of invoice lines
#   Monetary  = total revenue
rfm = df.groupby(['CustomerID']).agg({'InvoiceDate': lambda x : (snapshot_date - x.max()).days,
'InvoiceNo':'count','TotalSum': 'sum'})
#Rename columns
rfm.rename(columns={'InvoiceDate':'Recency','InvoiceNo':'Frequency','TotalSum':'MonetaryValue'}
,inplace= True)
#Final RFM values
rfm.head().hvplot.table()
Note that:
For Recency, customers who have been active more recently are rated higher than less recent customers, because every company wants its customers to have purchased recently.
For Frequency and Monetary Value, higher values receive higher labels, because we want customers to spend more money and visit more often (note that this ordering is the opposite of Recency's).
#Building RFM segments
# Recency labels are reversed (4 = most recent) because lower recency is
# better; Frequency/Monetary labels increase with value (4 = best).
r_labels =range(4,0,-1)
f_labels=range(1,5)
m_labels=range(1,5)
# qcut splits each metric into quartiles and attaches the labels above.
r_quartiles = pd.qcut(rfm['Recency'], q=4, labels = r_labels)
f_quartiles = pd.qcut(rfm['Frequency'],q=4, labels = f_labels)
m_quartiles = pd.qcut(rfm['MonetaryValue'],q=4,labels = m_labels)
rfm = rfm.assign(R=r_quartiles,F=f_quartiles,M=m_quartiles)
# Build RFM Segment and RFM Score
def add_rfm(x) : return str(int(x['R'])) + str(int(x['F'])) + str(int(x['M']))
# Segment = concatenated quartile digits (e.g. '444'); Score = their sum.
rfm['RFM_Segment'] = rfm.apply(add_rfm,axis=1 )
rfm['RFM_Score'] = rfm[['R','F','M']].sum(axis=1)
rfm.head().hvplot.table()
The result is a table with one row per customer containing their RFM values.
# Five most common segment codes.
rfm.groupby(['RFM_Segment']).size().sort_values(ascending=False)[:5]
#Select bottom RFM segment "111" and view top 5 rows
rfm[rfm['RFM_Segment']=='111'].head().hvplot.table()
RFM_Segment 444 447 111 385 344 217 122 206 211 179 dtype: int64
# Mean R/F/M per score plus group sizes — sanity check that the score
# orders customers monotonically across all three metrics.
rfm.groupby("RFM_Score").agg(
{"Recency": "mean", "Frequency": "mean", "MonetaryValue": ["mean", "count"]}
).round(1)
def segments(df):
    """Map a row's RFM_Score to a coarse three-tier segment label.

    Gold   : score > 9
    Silver : 6 <= score <= 9  (fixes the original 'Sliver' misspelling)
    Bronze : score <= 5
    """
    score = df["RFM_Score"]
    if score > 9:
        return "Gold"
    if score > 5:  # upper bound (<= 9) is implied by the branch above
        return "Silver"
    return "Bronze"
# Attach the coarse three-tier segment label and summarize each tier.
rfm["General_Segment"] = rfm.apply(segments, axis=1)
rfm.groupby("General_Segment").agg(
{"Recency": "mean", "Frequency": "mean", "MonetaryValue": ["mean", "count"]}
).round(1)
| Recency | Frequency | MonetaryValue | ||
|---|---|---|---|---|
| mean | mean | mean | count | |
| RFM_Score | ||||
| 3 | 260.8 | 8.2 | 159.3 | 385 |
| 4 | 175.4 | 13.6 | 239.4 | 382 |
| 5 | 154.3 | 21.4 | 368.5 | 516 |
| 6 | 96.5 | 27.9 | 825.7 | 460 |
| 7 | 79.1 | 38.5 | 755.5 | 459 |
| 8 | 64.6 | 56.8 | 994.1 | 454 |
| 9 | 46.0 | 80.0 | 1801.4 | 417 |
| 10 | 32.1 | 112.3 | 2049.4 | 426 |
| 11 | 21.1 | 187.7 | 4083.2 | 392 |
| 12 | 7.3 | 374.4 | 9319.2 | 447 |
| Recency | Frequency | MonetaryValue | ||
|---|---|---|---|---|
| mean | mean | mean | count | |
| General_Segment | ||||
| Bronze | 192.6 | 15.2 | 267.3 | 1283 |
| Gold | 19.9 | 228.3 | 5248.5 | 1265 |
| Sliver | 72.2 | 50.1 | 1077.7 | 1790 |
We must check the key k-means assumptions before we apply the k-means clustering algorithm.
# K-means assumes comparable scale and variance across features — inspect
# the raw statistics of the three metrics first.
rfm_rfm = rfm[['Recency','Frequency','MonetaryValue']]
print(rfm_rfm.describe())
Recency Frequency MonetaryValue count 4338.000000 4338.000000 4338.000000 mean 92.536422 91.720609 2054.266460 std 100.014169 228.785094 8989.230441 min 1.000000 1.000000 3.750000 25% 18.000000 17.000000 307.415000 50% 51.000000 41.000000 674.485000 75% 142.000000 100.000000 1661.740000 max 374.000000 7847.000000 280206.020000
From this table, we find a problem: the means and variances of the variables are not equal.
Solution: scale the variables using a scaler from the scikit-learn library.
theme = 'ggplot2'
# NOTE(review): `theme` is assigned but never used in the visible code.
# Raw distributions of the three metrics (hvplot overlays via `+`).
rfm['Recency'].plot(kind='hist') + rfm['Frequency'].plot(kind='hist') + rfm['MonetaryValue'].plot(kind='hist')
Also, there is another problem: the distributions of the variables are asymmetric (the data is skewed).
Solution: a logarithmic transformation (positive values only) will manage the skewness.
#Unskew the data with log transformation
# np.log applied to the DataFrame directly is vectorized and element-wise —
# identical values to the original row-by-row `.apply(np.log, axis=1)`,
# without the per-row Python loop.
rfm_log = np.log(rfm[['Recency', 'Frequency', 'MonetaryValue']]).round(3)
rfm_log['Recency'].plot(kind='hist') + rfm_log['Frequency'].plot(kind='hist') + rfm_log['MonetaryValue'].plot(kind='hist')
#Normalize the variables with StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit() learns per-column mean/std; transform() then returns a zero-mean,
# unit-variance numpy array.
scaler.fit(rfm_log)
#Store it separately for clustering
rfm_normalized= scaler.transform(rfm_log)
pd.DataFrame(rfm_normalized).describe()
StandardScaler()
| 0 | 1 | 2 | |
|---|---|---|---|
| count | 4.338000e+03 | 4.338000e+03 | 4.338000e+03 |
| mean | 7.862160e-17 | -2.096576e-16 | 6.355246e-16 |
| std | 1.000115e+00 | 1.000115e+00 | 1.000115e+00 |
| min | -2.630421e+00 | -2.775156e+00 | -4.179095e+00 |
| 25% | -6.126634e-01 | -6.385864e-01 | -6.843467e-01 |
| 50% | 1.148465e-01 | 2.583917e-02 | -6.090769e-02 |
| 75% | 8.297890e-01 | 6.978064e-01 | 6.543418e-01 |
| max | 1.505633e+00 | 3.988259e+00 | 4.721171e+00 |
Choosing a Number of Clusters
Methods to define the number of clusters
Elbow criterion method
from sklearn.cluster import KMeans
# First : Get the Best KMeans
# Elbow method: fit k-means for k = 1..7 and plot the inertia (within-cluster
# sum of squares); the bend of the curve suggests a good k.
ks = range(1, 8)
inertias = []
for k in ks:
    # Create a KMeans clusters
    kc = KMeans(n_clusters=k, random_state=1)
    kc.fit(rfm_normalized)
    inertias.append(kc.inertia_)
pd.DataFrame({"ks": ks, "inertias": inertias}).plot(
    x="ks",
    y="inertias",
    kind="line",
    title="What is the Best Number for KMeans ?",
    width=800,
)
KMeans(n_clusters=1, random_state=1)
KMeans(n_clusters=2, random_state=1)
KMeans(n_clusters=3, random_state=1)
KMeans(n_clusters=4, random_state=1)
KMeans(n_clusters=5, random_state=1)
KMeans(n_clusters=6, random_state=1)
KMeans(n_clusters=7, random_state=1)
# clustering
# Final model: k = 3 (chosen from the elbow plot); fixed seed for
# reproducible cluster assignments.
kc = KMeans(n_clusters= 3, random_state=1)
kc.fit(rfm_normalized)
#Create a cluster label column in the original DataFrame
cluster_labels = kc.labels_
#Calculate average RFM values and size for each cluster:
rfm_rfm_k3 = rfm_rfm.assign(K_Cluster = cluster_labels)
#Calculate average RFM values and sizes for each cluster:
rfm_rfm_k3.groupby('K_Cluster').agg({'Recency': 'mean','Frequency': 'mean',
'MonetaryValue': ['mean', 'count'],}).round(0)
KMeans(n_clusters=3, random_state=1)
| Recency | Frequency | MonetaryValue | ||
|---|---|---|---|---|
| mean | mean | mean | count | |
| K_Cluster | ||||
| 0 | 171.0 | 15.0 | 293.0 | 1523 |
| 1 | 69.0 | 65.0 | 1167.0 | 1859 |
| 2 | 13.0 | 260.0 | 6559.0 | 956 |
Snake plots to understand and compare segments
# Rebuild the normalized matrix as a labelled DataFrame so the k-means
# clusters and the quartile-based segments can be compared side by side.
rfm_normalized = pd.DataFrame(rfm_normalized,index=rfm_rfm.index,columns=rfm_rfm.columns)
rfm_normalized['K_Cluster'] = kc.labels_
rfm_normalized['General_Segment'] = rfm['General_Segment']
rfm_normalized.reset_index(inplace = True)
#Melt the data into a long format so RFM values and metric names are stored in 1 column each
rfm_melt = pd.melt(rfm_normalized,id_vars=['CustomerID','General_Segment','K_Cluster'],value_vars=['Recency', 'Frequency', 'MonetaryValue'],
var_name='Metric',value_name='Value')
rfm_melt.head()
| CustomerID | General_Segment | K_Cluster | Metric | Value | |
|---|---|---|---|---|---|
| 0 | 12346.0 | Sliver | 1 | Recency | 1.409982 |
| 1 | 12347.0 | Gold | 2 | Recency | -2.146578 |
| 2 | 12348.0 | Sliver | 1 | Recency | 0.383648 |
| 3 | 12349.0 | Gold | 1 | Recency | -0.574961 |
| 4 | 12350.0 | Bronze | 0 | Recency | 1.375072 |
# Snake plots: one line per segment across the three normalized RFM metrics.
# Left: quantile-based segments; right: k-means clusters.
f, (ax1, ax2) = plt.subplots(1,2, figsize=(15, 8))
sns.lineplot(x = 'Metric', y = 'Value', hue = 'General_Segment', data = rfm_melt,ax=ax1)
# a snake plot with K-Means
sns.lineplot(x = 'Metric', y = 'Value', hue = 'K_Cluster', data = rfm_melt,ax=ax2)
plt.suptitle("Snake Plot of RFM",fontsize=24) #make title fontsize subtitle
plt.show()
<AxesSubplot:xlabel='Metric', ylabel='Value'>
<AxesSubplot:xlabel='Metric', ylabel='Value'>
Text(0.5, 0.98, 'Snake Plot of RFM')
Relative importance of segment attributes
Useful technique to identify relative importance of each segment's attribute
Let's try again with a heat map. Heat maps are a graphical representation of data where larger values are colored in darker shades and smaller values in lighter shades. We can compare the variance between the groups quite intuitively by color.
# The further a ratio is from 0, the more important that attribute is for a segment relative to the total population
cluster_avg = rfm_rfm_k3.groupby(['K_Cluster']).mean()
population_avg = rfm_rfm.mean()
# Relative importance = cluster mean / overall mean - 1 (per attribute).
relative_imp = cluster_avg / population_avg - 1
relative_imp.round(2)
| Recency | Frequency | MonetaryValue | |
|---|---|---|---|
| K_Cluster | |||
| 0 | 0.85 | -0.84 | -0.86 |
| 1 | -0.25 | -0.28 | -0.43 |
| 2 | -0.86 | 1.87 | 2.20 |
# the mean value in total (the first three columns of rfm are
# Recency, Frequency and MonetaryValue)
total_avg = rfm.iloc[:, 0:3].mean()
# calculate the proportional gap with total mean; the three numeric RFM
# columns are selected explicitly so groupby-mean never touches the
# object/categorical columns (required for pandas >= 2.0, and identical to
# the original `.mean().iloc[:, 0:3]` result on older versions)
cluster_avg = rfm.groupby('General_Segment')[['Recency', 'Frequency', 'MonetaryValue']].mean()
prop_rfm = cluster_avg/total_avg - 1
prop_rfm.round(2)
| Recency | Frequency | MonetaryValue | |
|---|---|---|---|
| General_Segment | |||
| Bronze | 1.08 | -0.83 | -0.87 |
| Gold | -0.78 | 1.49 | 1.56 |
| Sliver | -0.22 | -0.45 | -0.48 |
# heatmap with RFM
# Side-by-side heatmaps of relative attribute importance:
# k-means clusters (left) vs quantile-based RFM segments (right).
f, (ax1, ax2) = plt.subplots(1,2, figsize=(15, 5))
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='Blues',ax=ax1)
ax1.set(title = "Heatmap of K-Means")
# heatmap of the quantile-based RFM segments
sns.heatmap(prop_rfm, cmap= 'Oranges', fmt= '.2f', annot = True,ax=ax2)
ax2.set(title = "Heatmap of RFM quantile")
plt.suptitle("Heat Map of RFM",fontsize=20) #make title fontsize subtitle
plt.show()
<AxesSubplot:ylabel='K_Cluster'>
[Text(0.5, 1.0, 'Heatmap of K-Means')]
<AxesSubplot:ylabel='General_Segment'>
[Text(0.5, 1.0, 'Heatmap of RFM quantile')]
Text(0.5, 0.98, 'Heat Map of RFM')